Package org.terrier.structures.indexing.singlepass.hadoop

Source Code of org.terrier.structures.indexing.singlepass.hadoop.HadoopRunPostingIterator

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.uk
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is HadoopRunPostingIterator.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*  
*/
package org.terrier.structures.indexing.singlepass.hadoop;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Iterator;

import org.terrier.compression.BitInputStream;
import org.terrier.structures.indexing.singlepass.PostingInRun;
import org.terrier.structures.indexing.singlepass.RunIterator;

/**
* This class allows the iteration of over a postings within a run within
* the Hadoop framwork.
* @since 2.2
* @author Craig Macdonald
  *
*/
public class HadoopRunPostingIterator extends RunIterator {

  /** Runs To Be Merged */
  protected Iterator<MapEmittedPostingList> postingIterator;
 
  /** Map number */
  protected String mapNo;
  /** Term that we're processing */
  protected String term;
  /** The Split that the current posting comes from */
  protected int splitNo;

  /** Constructs a new RunPostingIterator.
    * @param postingClass is the name of the class to use to read the postings
    * @param runNo is the number of the run
    * @param _postingiterator is the iterator of reduce input data that we are going to loop through
    * @param _term is the term that this iterator is operating on
    */
  public HadoopRunPostingIterator(
      Class<? extends PostingInRun> postingClass,
      int runNo,
      Iterator<MapEmittedPostingList> _postingiterator,
      String _term, int numFields)
  throws Exception
  {
    super(postingClass, runNo, numFields);
    postingIterator = _postingiterator;
    term = _term;
    createPosting();
  }

  /** Move to the next posting */ 
  @Override
  public boolean hasNext() {
    return postingIterator.hasNext();
  }

  /** Return the next PostingInRun */
  @Override
  public PostingInRun next() {
    try{
      /** Current Posting List */
      final MapEmittedPostingList post = postingIterator.next();
      posting.setTerm(term);
      posting.setPostingSource(new BitInputStream(new ByteArrayInputStream(post.getArray())));
      posting.setDf(post.getDocumentFreq());
      posting.setTF(post.getTermFreq());
      mapNo = post.getMap();
      flushNo = post.getFlushNo();
      splitNo = post.getSplitNo();
    } catch (IOException ioe) {
      throw new Error(ioe);
    }
    return posting;
  }

  /** Returns the map that the current posting came from */
  public String getMapNo() { return mapNo; }

  /**
   * @return the splitNo
   */
  public int getSplitNo() {
    return splitNo;
  }
 
 
}
TOP

Related Classes of org.terrier.structures.indexing.singlepass.hadoop.HadoopRunPostingIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.